import numpy as np
import shapefile as shp
import pandas as pd
from pandas.api.types import CategoricalDtype
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
import zipfile
import os
from ds100_utils import run_linear_regression_test
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import descartes
import geopandas as gpd
from shapely.geometry import Point, Polygon
from IPython.display import Image
# Plot settings
plt.rcParams['figure.figsize'] = (12, 9)
plt.rcParams['font.size'] = 12
#first we loaded the Cook County training data. We used the training data over the sale data
#because the training data includes Sale Prices. We wanted to map the most expensive homes and
#least expensive homes in the county on a map of Cook County with the historical redlining
#districts to see if there was any correlation.
with zipfile.ZipFile('cook_county_data.zip') as item:
item.extractall()
cook_county = pd.read_csv("cook_county_train.csv", index_col='Unnamed: 0')
#Looking at the 300 most expensive homes in the county. We picked the top 300 homes because
#there seems to be a huge jump and range in sale prices among the top 300. When we tried to
#use a sample size greater than 300, our map started to make incoherent patterns.
cook_county_top300=cook_county.sort_values('Sale Price', ascending=False).head(300)
cook_county_top300['Sale Price']
106057 71000000
71953 28000000
37835 12700000
929 12250000
19702 12000000
...
103193 2840000
64193 2840000
140896 2837500
50825 2830000
167480 2825000
Name: Sale Price, Length: 300, dtype: int64
geometry=[Point(xy) for xy in zip(cook_county_top300.Longitude, cook_county_top300.Latitude)]
cook_county_top300['geometry'] = geometry
#Incorporated the shape file from Mapping Inequality (https://dsl.richmond.edu/panorama/redlining/#loc=11/41.641/-87.733)
#This loaded a base map.
#Used this resource to learn how to map onto shape files: https://towardsdatascience.com/geopandas-101-plot-any-data-with-a-latitude-and-longitude-on-a-map-98e01944b972
cook_map = gpd.read_file('cartodb-query.shp')
fig,ax = plt.subplots(figsize = (15,15))
cook_map.plot(ax = ax)
geometry=[Point(xy) for xy in zip(cook_county_top300.Longitude, cook_county_top300.Latitude)]
geo_df = gpd.GeoDataFrame(geometry = geometry)
print(geo_df)
g = geo_df.plot(ax = ax, markersize = 20, color = 'red',marker = '*',label = 'Top 300')
plt.show()
geometry 0 POINT (-87.74947 41.74931) 1 POINT (-87.74617 42.11019) 2 POINT (-87.71695 42.09977) 3 POINT (-87.71283 42.09590) 4 POINT (-87.73599 42.11973) .. ... 295 POINT (-87.66717 41.95868) 296 POINT (-87.74789 42.11618) 297 POINT (-87.66328 41.95183) 298 POINT (-87.65001 41.91677) 299 POINT (-87.67095 42.03893) [300 rows x 1 columns]
#Built an actual map since the different districts couldn't be seen on the base map above.
#Code was taken from this source: https://stackoverflow.com/questions/63644131/how-to-use-geopandas-to-plot-latitude-and-longitude-on-a-more-detailed-map-with
import geopandas as gpd
import matplotlib.pyplot as plt
import contextily as ctx
from shapely.geometry import Point
ward = gpd.read_file('cartodb-query.shp', bbox=None, mask=None, rows=None)
geo_df = gpd.GeoDataFrame(geometry = geometry)
ward.crs = {'init':"epsg:4326"}
geo_df.crs = {'init':"epsg:4326"}
# plot the polygon
ax = ward.plot(alpha=0.35, color='#d66058', zorder=1)
# plot the boundary only (without fill), just uncomment
#ax = gpd.GeoSeries(ward.to_crs(epsg=3857)['geometry'].unary_union).boundary.plot(ax=ax, alpha=0.5, color="#ed2518",zorder=2)
ax = gpd.GeoSeries(ward['geometry'].unary_union).boundary.plot(ax=ax, alpha=0.5, color="#ed2518",zorder=2)
# plot the marker
ax = geo_df.plot(ax = ax, markersize = 20, color = 'red',marker = '*',label = 'Top 300', zorder=3)
ctx.add_basemap(ax, crs=geo_df.crs.to_string(), source=ctx.providers.OpenStreetMap.Mapnik)
plt.show()
#Imported an image of the redlined districts to see the different historically redlined areas.
from IPython import display
display.Image("Redlining.png")
#Source: https://digitalchicagohistory.org/files/original/1868c31f074d1ace5e7cac8f5851d60d.png